This notebook combines quantitative imaging features such as asymmetry index and Neurostat's 3D-SSP $z$ score with clinical variables to predict surgical outcome.
Acknowledgements:
In [62]:
import warnings
def get_warning():
warnings.warn("deprecated",DeprecationWarning)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
get_warning()
# Import general modules
import os
import re
from itertools import cycle
from multiprocessing import Pool
# Import HTML/js libraries
from IPython.display import display, Javascript, HTML, Math, Latex
import jinja2
# Import data science modules
import numpy as np
import scipy as sp
from scipy import interp, ndimage
import pandas as pd
import nibabel as nib
# Import graphing modules
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
# Import machine learning modules
from sklearn import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.feature_selection import *
from sklearn.preprocessing import *
from sklearn.tree import *
from sklearn.externals import joblib
def inline_dc(stuff):
"""
Embeds the HTML source of the dc charts directly into the IPython notebook.
This method will not work if the dc charts depends on any files (json data). Also this uses
the HTML5 srcdoc attribute, which may not be supported in all browsers.
"""
return HTML('<iframe srcdoc="{srcdoc}" style="width: 100%; height: 300px; border: none"></iframe>'.format(srcdoc=stuff.replace('"', '"')))
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[62]:
The following DataFrame shows all the clinical and imaging features used as features in the Random Forest surgical outcome classifier. Good surgical outcome is defined as few to none seizures (Engel I or II). Poor surgical outcome is defined as persistent seizures (Engel III or IV).
In [2]:
head = HTML("""
<head>
<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/dc/2.0.0-alpha.2/dc.css"/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/crossfilter/1.3.9/crossfilter.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/dc/1.7.0/dc.min.js"></script>
</head>
""")
body_html = HTML("""
<h1>Surgical Outcomes</h1>
<div id="outcome" class="dc-chart">
<a class="reset" href="javascript:void(0);" onclick="javascript:outcome.filterAll();petclass.filterAll();eeg.filterAll();dc.redrawAll();" style="display: none;">reset</a>
</div>
<h1>Radiologist's PET read</h1>
Radiologist PET reads are classified as: M (Multilobar), R (Restricted), S (Subtle) or Normal.
<div id="petclass" class="dc-chart">
</div>
<h1>Neurologist EEG Read</h1>
EEG reads are classified by lobe of seizure onset on Scalp EEG.
<div id="eeg" class="dc-chart">
</div>
<h1>Lesion Status</h1>
<div id="lesional" class="dc-chart">
</div>
""")
df = pd.read_csv('../data/qPET_data.csv')
dc = jinja2.Template(
"""
// Load data
var data = {{ data }};
// Feed it through crossfilter
var cases = crossfilter(data);
// for testing
// console.log(data);
//define a dimension
//Here we will group by state
var outcome_Dimension = cases.dimension(function(d) {return d.outcome;});
var petclass_Dimension = cases.dimension(function(d) {return d.petclass;});
var eeg_Dimension = cases.dimension(function(d) {return d.eeg;});
var lesional_Dimension = cases.dimension(function(d) {return d.lesional;});
//Outcome row chart
// Create Global Variables
var outcome = dc.rowChart("#outcome");
var outcomeGroup = outcome_Dimension.group()
outcome.dimension(outcome_Dimension)
.group(outcomeGroup)
.width(400)
.height(200)
.colors(d3.scale.category10());
//PET class row chart
// Create Global Variables
var petclass = dc.rowChart("#petclass");
var petclassGroup = petclass_Dimension.group()
petclass.dimension(petclass_Dimension)
.group(petclassGroup)
.width(400)
.height(200)
.colors(d3.scale.category10());
//EEG class row chart
// Create Global Variables
var eeg = dc.rowChart("#eeg");
var eegGroup = eeg_Dimension.group()
eeg.dimension(eeg_Dimension)
.group(eegGroup)
.width(400)
.height(200)
.colors(d3.scale.category10());
//Lesional class row chart
// Create Global Variables
var lesional = dc.rowChart("#lesional");
var lesionalGroup = lesional_Dimension.group()
lesional.dimension(lesional_Dimension)
.group(lesionalGroup)
.width(400)
.height(200)
.colors(d3.scale.category10());
dc.renderAll(); // render all charts on the page
""")
body_js = Javascript(dc.render(
data=df.to_json(orient='records')
))
inline_dc(head.data + body_html.data + "<script>" + body_js.data + "</script>")
Out[2]:
In [3]:
# Load feature matrix in csv file
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Remove unencoded clinical and outcome variables
ft_matrix = np.array(df[df.columns.difference(['id','Unnamed: 0','hemi_resected','outcome_binary','resected'])])
# Compute z-score transformation for better visualization
ft_matrix = sp.stats.zscore(ft_matrix,axis=1)
# Plot heatmap of feature matrix
plt.figure(num=None, figsize=(10, 8), dpi=1200)
plt.title('Sample feature matrix consisting of one-hot encoded clinical features and quantitative PET imaging features')
plt.xlabel('Feature Index')
plt.ylabel('Patient')
plt.imshow(ft_matrix); plt.clim((-1,1)); plt.show()
We used the ROC curve to compute the area under the curve and computed confidence intervals using $\sigma_{\text{max}}^{2} = \frac{AUC(1-AUC)}{\min\{m,n\}} \leq \frac{1}{4\min\{m,n\}}$, with $m=48$ and $n=8$ being the number of good and poor surgical outcomes in the evaluation dataset.
In [60]:
def _helper(KBest_pair):
'''
This helper function computes the mean AUC given the number of features during
feature reduction steps of the pipeline.
'''
# Get number of features at the two feature selection steps (first and third prune)
KBest1 = KBest_pair[0]
KBest2 = KBest_pair[1]
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Initialize feature matrix and target vector
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Initialize Random Forest Classifier with a small number of estimators (for speed)
classifier_all = RandomForestClassifier(n_estimators=1000)
# Perform the first prune using ANOVA F test using mutual information
prune1 = SelectKBest(mutual_info_classif, k=KBest1)
X = prune1.fit_transform(X,y)
# Perform a second prune by selecting features optimally branched using the classifier
clf = classifier_all.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Perform the third prune using ANOVA F test using mutual information
X = SelectKBest(mutual_info_classif, k=KBest2).fit_transform(X,y)
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
if(sum(y[test]) == 0):
continue
probas_ = classifier_all.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc1 = auc(mean_fpr, mean_tpr)
return KBest1, KBest2, mean_auc1
KBest_pairs = []
for KBest2 in range(10,450,10):
for KBest1 in range(KBest2,450,10):
KBest_pairs.append((KBest1,KBest2))
# Parallel version
n_proc = 60
pool = Pool(n_proc)
return_list = pool.map(_helper, KBest_pairs)
pool.close()
We next plot a heat map of mean CV AUC for different pairs of $\big(k_1,k_2\big)$ for $k_2 \geq k_1$.
In [61]:
# Get values of heat map from previous step
A = np.array(return_list)
X,Y,Z = A.T # with Z the values at points x,y
# Generate the heatmap plot
plt.scatter(X,Y,c=Z)
plt.title('Mean Cross-Validation AUC as a function of number of features preserved\n')
plt.xlabel('$k_1$')
plt.ylabel('$k_2$')
plt.colorbar()
plt.show
Out[61]:
The choice of $\big(k_1,k_2\big)=\big(350,350\big)$ was made somewhat arbitrarily since it is near a region of high mean CV AUC.
This model uses 3 clinical variables: EEG localization of seizure onset, extent of PET foci of hypometabolism (restricted, subtle, vs diffuse or normal), and lesion status of patient based on other neuroimaging. These clinical variables are encoded using a one-hot encoder and fed in to the classifier as a feature matrix. These features are then combined polynomially (degree 2) to create interaction terms and squares of the features. Then $k$=8 fold cross-validation (CV) is used to compute the mean CV AUC. In addition, the mean CV AUC and 95% confidence interval is computed.
In [9]:
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_only.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Generate feature matrix and target vectors
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Generate polynomial (interaction only) features based on clinical variables alone.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Build a Random Forest with 5000 estimators
classifier = RandomForestClassifier(n_estimators=5000)
# classifier = ExtraTreesClassifier(n_estimators=5000, max_depth=None, min_samples_split=2)
# Compute 8-fold cross-validation True Positive Rate (TPR) and False Positive Rate (FPR)
# to generate ROC curves.
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
# Ignore any folds that do not have any poor outcomes
# to maintain representation of entire dataset.
if(sum(y[test]) == 0):
continue
probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
# Compute mean Area Under Curve (AUC)
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc_clinical_alone = auc(mean_fpr, mean_tpr)
sigma_auc_clinical_alone = 2*np.sqrt(mean_auc_clinical_alone*(1-mean_auc_clinical_alone)/4)
# Compute Mean AUC and Confidence interval using \sigma_max
display(Latex("Mean AUC: %0.3f $\pm$ %0.3f"%(mean_auc_clinical_alone,sigma_auc_clinical_alone)))
In [10]:
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Build a Random Forest with 1000 estimators
classifier_3dssp = RandomForestClassifier(n_estimators=1000)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier_3dssp.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
# Compute 8-fold cross-validation True Positive Rate (TPR) and False Positive Rate (FPR)
# to generate ROC curves.
print 'Performing cross validation ...'
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
# Ignore any folds that do not have any poor outcomes
# to maintain representation of entire dataset.
if(sum(y[test]) == 0):
continue
probas_ = classifier_3dssp.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
# Compute mean Area Under Curve (AUC)
print 'Computing AUC of ROC ...'
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc_3dssp = auc(mean_fpr, mean_tpr)
sigma_auc_3dssp = 2*np.sqrt(mean_auc_3dssp*(1-mean_auc_3dssp)/4)
# Compute Mean AUC and Confidence interval using \sigma_max
display(Latex("Mean AUC: %0.3f $\pm$ %0.3f"%(mean_auc_3dssp,sigma_auc_3dssp)))
where $P$ is the native patient PET image for all voxels $x \in \Omega_P$ and $\bar{P}$ is the PET image mirrored across the sagittal plane and affinely registered to $P$ in the image domain $\Omega_P$.
All ROIs are computed in gray matter AAL parcellations. These include:
All ROIs are computed ipsilateral and contralateral to the resected hemisphere. The Mean CV AUC and 95% confidence interval is computed.
In [11]:
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Build a Random Forest with 1000 estimators
classifier_voxel_ai = RandomForestClassifier(n_estimators=1000)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier_voxel_ai.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
# Compute 8-fold cross-validation True Positive Rate (TPR) and False Positive Rate (FPR)
# to generate ROC curves.
print 'Performing cross validation ...'
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
# Ignore any folds that do not have any poor outcomes
# to maintain representation of entire dataset.
if(sum(y[test]) == 0):
continue
probas_ = classifier_voxel_ai.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
# Compute mean Area Under Curve (AUC)
print 'Computing AUC of ROC ...'
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc_voxel_ai = auc(mean_fpr, mean_tpr)
sigma_auc_voxel_ai = 2*np.sqrt(mean_auc_voxel_ai*(1-mean_auc_voxel_ai)/4)
# Compute Mean AUC and Confidence interval using \sigma_max
display(Latex("Mean AUC: %0.3f $\pm$ %0.3f"%(mean_auc_voxel_ai,sigma_auc_voxel_ai)))
In [12]:
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Build a Random Forest with 5000 estimators
classifier_all = RandomForestClassifier(n_estimators=1000)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier_all.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
# Compute 8-fold cross-validation True Positive Rate (TPR) and False Positive Rate (FPR)
# to generate ROC curves.
print 'Performing cross validation ...'
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
# Ignore any folds that do not have any poor outcomes
# to maintain representation of entire dataset.
if(sum(y[test]) == 0):
continue
probas_ = classifier_all.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
# Compute mean Area Under Curve (AUC)
print 'Computing AUC of ROC ...'
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc_all = auc(mean_fpr, mean_tpr)
sigma_auc_all = 2*np.sqrt(mean_auc_all*(1-mean_auc_all)/4)
# Compute Mean AUC and Confidence interval using \sigma_max
display(Latex("Mean AUC: %0.3f $\pm$ %0.3f"%(mean_auc_all,sigma_auc_all)))
In [25]:
# Load all features DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
df_all = pd.read_csv('../data/qPET_data.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
mask = df_all['id'].isin(['PET75','PET77'])
df_all = df_all[~mask]
# # Determine randomly generated train and test split
# while(True):
# rs = ShuffleSplit(n_splits=1, test_size= 0.33)
# for train_index, test_index in rs.split(df.id):
# train = train_index
# test = test_index
# tmp = set(df.iloc[test,:].id)
# if('PET62' in tmp and 'PET63' in tmp and 'PET37' in tmp and 'PET61' in tmp and 'PET85' in tmp):
# break
train = [42,5,38,3,46,35,50,28,19,29,15,49,41,23,20,8,21,26,30,47,31,34,2,1,48,32,18,22,0,24,53,36,45,51,39,52]
test = [12,55,43,4,14,9,44,54,10,13,40,17,11,33,16,6,7,25]
train_idx = df[df['Unnamed: 0'].isin(train)].id
test_idx = df[df['Unnamed: 0'].isin(test)].id
# print train_idx, test_idx, df.iloc[test,:].outcome_binary, df_all.iloc[test,:].outcome
In [26]:
# Load all DataFrames
df1 = pd.read_csv('../data/qPET_feature_matrix_clinical_only.csv')
df2 = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp.csv')
df3 = pd.read_csv('../data/qPET_feature_matrix_clinical_voxel_ai.csv')
df4 = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df1['id'].isin(['PET75','PET77'])
df1 = df1[~mask]
mask = df2['id'].isin(['PET75','PET77'])
df2 = df2[~mask]
mask = df3['id'].isin(['PET75','PET77'])
df3 = df3[~mask]
mask = df4['id'].isin(['PET75','PET77'])
df4 = df4[~mask]
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
df1_train_idx = np.where(df1.id.isin(train_idx))
df1_test_idx = np.where(df1.id.isin(test_idx))
X1 = np.array(df1[df1.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y1 = np.array(df1.outcome_binary-1)
X1_labels = df1.columns.difference(['Unnamed: 0','id','outcome_binary'])
df2_train_idx = np.where(df2.id.isin(train_idx))
df2_test_idx = np.where(df2.id.isin(test_idx))
X2 = np.array(df2[df2.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y2 = np.array(df2.outcome_binary-1)
X2_labels = df2.columns.difference(['Unnamed: 0','id','outcome_binary'])
df3_train_idx = np.where(df3.id.isin(train_idx))
df3_test_idx = np.where(df3.id.isin(test_idx))
X3 = np.array(df3[df3.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y3 = np.array(df3.outcome_binary-1)
X3_labels = df3.columns.difference(['Unnamed: 0','id','outcome_binary'])
df4_train_idx = np.where(df4.id.isin(train_idx))
df4_test_idx = np.where(df4.id.isin(test_idx))
X4 = np.array(df4[df4.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y4 = np.array(df4.outcome_binary-1)
X4_labels = df4.columns.difference(['Unnamed: 0','id','outcome_binary'])
In [28]:
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Build a Random Forest with 5000 estimators
classifier1 = RandomForestClassifier(n_estimators=1000)
classifier2 = RandomForestClassifier(n_estimators=1000)
classifier3 = RandomForestClassifier(n_estimators=1000)
classifier4 = RandomForestClassifier(n_estimators=1000)
## Do feature reduction on Model 1
print 'Model 1 ..................'
X = np.copy(X1)
y = np.copy(y1)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
X1_pruned = X
X1_labels = np.array(poly.get_feature_names(X1_labels))
## Do feature reduction on Model 2
print 'Model 2 ..................'
X = np.copy(X2)
y = np.copy(y2)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
X2_labels = X2_labels[prune1.get_support()]
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier2.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
X2_labels = X2_labels[model.get_support()]
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
X2_labels = np.array(poly.get_feature_names(X2_labels))
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
X2_pruned = X
X2_labels = X2_labels[prune2.get_support()]
## Do feature reduction on Model 3
print 'Model 3 ..................'
X = np.copy(X3)
y = np.copy(y3)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
X3_labels = X3_labels[prune1.get_support()]
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier3.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
X3_labels = X3_labels[model.get_support()]
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
X3_labels = np.array(poly.get_feature_names(X3_labels))
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
X3_pruned = X
X3_labels = X3_labels[prune2.get_support()]
## Do feature reduction on Model 4
print 'Model 4 ..................'
X = np.copy(X4)
y = np.copy(y4)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
X4_labels = X4_labels[prune1.get_support()]
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier4.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
X4_labels = X4_labels[model.get_support()]
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
X4_labels = np.array(poly.get_feature_names(X4_labels))
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
X4_pruned = X
X4_labels = X4_labels[prune2.get_support()]
In [32]:
colors = ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange']
lw = 2
# Train and test Model 1
X1_train = np.squeeze(X1_pruned[df1_train_idx,:])
y1_train = y1[df1_train_idx]
X1_test = np.squeeze(X1_pruned[df1_test_idx,:])
y1_test = y1[df1_test_idx]
y1_hat = classifier1.fit(X1_train,y1_train).predict_proba(X1_test)
fpr, tpr, thresholds = roc_curve(y1_test,y1_hat[:,1])
roc_auc = auc(fpr, tpr)
i = 1
plt.plot(fpr, tpr, lw=lw, color=colors[i], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Save Model 1 for model persistence
# joblib.dump(classifier1, 'classifier1.pkl')
# Train and test Model 2
X2_train = np.squeeze(X2_pruned[df2_train_idx,:])
y2_train = y2[df2_train_idx]
X2_test = np.squeeze(X2_pruned[df2_test_idx,:])
y2_test = y2[df2_test_idx]
y2_hat = classifier2.fit(X2_train,y2_train).predict_proba(X2_test)
fpr, tpr, thresholds = roc_curve(y2_test,y2_hat[:,1])
roc_auc = auc(fpr, tpr)
i = 2
plt.plot(fpr, tpr, lw=lw, color=colors[i], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Save Model 2 for model persistence
# joblib.dump(classifier2, 'classifier2.pkl')
# Train and test Model 3
X3_train = np.squeeze(X3_pruned[df3_train_idx,:])
y3_train = y3[df3_train_idx]
X3_test = np.squeeze(X3_pruned[df3_test_idx,:])
y3_test = y3[df3_test_idx]
y3_hat = classifier3.fit(X3_train,y3_train).predict_proba(X3_test)
fpr, tpr, thresholds = roc_curve(y3_test,y3_hat[:,1])
roc_auc = auc(fpr, tpr)
i = 3
plt.plot(fpr, tpr, lw=lw, color=colors[i], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Save Model 3 for model persistence
# joblib.dump(classifier3, 'classifier3.pkl')
# Train and test Model 4
X4_train = np.squeeze(X4_pruned[df4_train_idx,:])
y4_train = y4[df4_train_idx]
X4_test = np.squeeze(X4_pruned[df4_test_idx,:])
y4_test = y4[df4_test_idx]
y4_hat = classifier4.fit(X4_train,y4_train).predict_proba(X4_test)
fpr, tpr, thresholds = roc_curve(y4_test,y4_hat[:,1])
roc_auc = auc(fpr, tpr)
i = 4
plt.plot(fpr, tpr, lw=lw, color=colors[i], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Save Model 4 for model persistence
# joblib.dump(classifier4, 'classifier4.pkl')
# Plot the ROC curve for luck along with area
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('Receiver operating characteristic')
plt.xlabel('False Postive Rate')
plt.ylabel('Trupe Postive Rate')
plt.legend(loc="lower right")
plt.show()
In [42]:
# ## Code dump for recreating Figure 3 using saved models (rather than refitting and retesting).
# for kkk in range(100):
# X4_train = np.squeeze(X4_pruned[df4_train_idx,:])
# y4_train = y4[df4_train_idx]
# X4_test = np.squeeze(X4_pruned[df4_test_idx,:])
# y4_test = y4[df4_test_idx]
# y4_hat = classifier4.fit(X4_train,y4_train).predict_proba(X4_test)
# fpr, tpr, thresholds = roc_curve(y4_test,y4_hat[:,1])
# roc_auc = auc(fpr, tpr)
# res = y4_hat[:,1]>thresholds[1]
# res_num = []
# idx = []
# for ii,k in enumerate(res):
# if(k):
# res_num.append(1)
# else:
# res_num.append(0)
# idx.append(df4.iloc[df4_test_idx[0][ii]].id)
# if(sum(res_num[:6]) >= 3):
# break
# res = y4_hat[:,1]>thresholds[2]
# res_num = []
# idx = []
# for ii,k in enumerate(res):
# if(k):
# res_num.append(1)
# else:
# res_num.append(0)
# idx.append(df4.iloc[df4_test_idx[0][ii]].id)
# if(sum(res_num[:6]) >= 3):
# break
# print fpr, tpr, thresholds
# print y4_test
# print res_num
# print idx
# joblib.dump(classifier4,'best_classifier4.pkl')
# joblib.dump(X1_labels,'X1_labels.pkl')
# joblib.dump(X1_pruned,'X1_pruned.pkl')
# joblib.dump(X1_test,'X1_test.pkl')
# joblib.dump(X1_train,'X1_train.pkl')
# joblib.dump(X2_labels,'X2_labels.pkl')
# joblib.dump(X2_pruned,'X2_pruned.pkl')
# joblib.dump(X2_test,'X2_test.pkl')
# joblib.dump(X2_train,'X2_train.pkl')
# joblib.dump(X3_labels,'X3_labels.pkl')
# joblib.dump(X3_pruned,'X3_pruned.pkl')
# joblib.dump(X3_test,'X3_test.pkl')
# joblib.dump(X3_train,'X3_train.pkl')
# joblib.dump(X4_labels,'X4_labels.pkl')
# joblib.dump(X4_pruned,'X4_pruned.pkl')
# joblib.dump(X4_test,'X4_test.pkl')
# joblib.dump(X4_train,'X4_train.pkl')
# joblib.dump(df1,'df1.pkl')
# joblib.dump(df1_test_idx,'df1_test_idx.pkl')
# joblib.dump(df1_train_idx,'df1_train_idx.pkl')
# joblib.dump(df2,'df2.pkl')
# joblib.dump(df2_test_idx,'df2_test_idx.pkl')
# joblib.dump(df2_train_idx,'df2_train_idx.pkl')
# joblib.dump(df3,'df3.pkl')
# joblib.dump(df3_test_idx,'df3_test_idx.pkl')
# joblib.dump(df3_train_idx,'df3_train_idx.pkl')
# joblib.dump(df4,'df4.pkl')
# joblib.dump(df4_test_idx,'df4_test_idx.pkl')
# joblib.dump(df4_train_idx,'df4_train_idx.pkl')
# joblib.dump(test_idx,'test_idx.pkl')
# joblib.dump(train_idx,'train_idx.pkl')
# colors = ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange']
# lw = 2
# # Load Model 1 and plot its ROC curve
# X1_test = np.squeeze(X1_pruned[df1_test_idx,:])
# y1_test = y1[df1_test_idx]
# classifier1 = joblib.load('classifier1.pkl')
# y1_hat = classifier1.predict_proba(X1_test)
# fpr, tpr, thresholds = roc_curve(y1_test,y1_hat[:,1])
# roc_auc = auc(fpr, tpr)
# i = 1
# plt.plot(fpr, tpr, lw=lw, color=colors[i], label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# # Load Model 2 and plot its ROC curve
# X2_test = np.squeeze(X2_pruned[df2_test_idx,:])
# y2_test = y2[df2_test_idx]
# classifier2 = joblib.load('classifier2.pkl')
# y2_hat = classifier2.predict_proba(X2_test)
# fpr, tpr, thresholds = roc_curve(y2_test,y2_hat[:,1])
# roc_auc = auc(fpr, tpr)
# i = 2
# plt.plot(fpr, tpr, lw=lw, color=colors[i], label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# # Load Model 3 and plot its ROC curve
# X3_test = np.squeeze(X3_pruned[df3_test_idx,:])
# y3_test = y3[df3_test_idx]
# classifier3 = joblib.load('classifier3.pkl')
# y3_hat = classifier3.predict_proba(X3_test)
# fpr, tpr, thresholds = roc_curve(y3_test,y3_hat[:,1])
# roc_auc = auc(fpr, tpr)
# i = 3
# plt.plot(fpr, tpr, lw=lw, color=colors[i], label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# # Load Model 4 and plot its ROC curve
# X4_test = np.squeeze(X4_pruned[df4_test_idx,:])
# y4_test = y4[df4_test_idx]
# classifier4 = joblib.load('classifier4.pkl')
# y4_hat = classifier4.predict_proba(X4_test)
# fpr, tpr, thresholds = roc_curve(y4_test,y4_hat[:,1])
# roc_auc = auc(fpr, tpr)
# i = 4
# plt.plot(fpr, tpr, lw=lw, color=colors[i], label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# # Plot ROC curve for luck
# plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
# plt.xlim([-0.05, 1.05])
# plt.ylim([-0.05, 1.05])
# plt.title('Receiver operating characteristic')
# plt.xlabel('False Postive Rate')
# plt.ylabel('Trupe Postive Rate')
# plt.legend(loc="lower right")
# plt.show()
In [7]:
'''
This section are code to count and compute demographics and its statistics.
'''
# Load dataset
tmp = pd.read_csv('../../clinical/clinical.csv')
# Clean dataset, removing any incomplete data elements
mask = tmp.Outcome.isin(['Not Available'])
tmp = tmp[~mask]
mask = tmp['Resected Hemi'].isin(['Not Available'])
tmp = tmp[~mask]
mask = tmp['PET Class'].isin(['Not Available'])
tmp = tmp[~mask]
res = pd.to_numeric(tmp['Outcome'])
# Count how many patients were lesional, pathology, PET class and resection info
t = tmp[res==1]['Resected Hemi'] + tmp[res==1]['Resected Lobe']
# print t.value_counts()
t = tmp[res>1]['Resected Hemi'] + tmp[res>1]['Resected Lobe']
# print t.value_counts()
t = tmp[res==1]['Lesional?']
# print t.value_counts()
t = tmp[res>1]['Lesional?']
# print t.value_counts()
t = tmp[res==1]['PathologyRead']
# print t.value_counts()
t = tmp[res>1]['PathologyRead']
# print t.value_counts()
t = tmp[res==1]['PET Class']
# print t.value_counts()
t = tmp[res>1]['PET Class']
# print t.value_counts()
# Compute Fisher's exact statistic for lesional distribution between outcomes
lesional_table = [[30,14],[16,13]]
print 'The p-value for Fisher test on Lesion status: %0.3f'%sp.stats.fisher_exact(lesional_table)[1]
# Compute Chi squared statistic for pathology distribution between outcomes
pathology = [[31,15],[2,3],[2,1],[3,2],[7,4],[1,2]]
print 'The p-value for Fisher test on Pathology: %0.3f'%sp.stats.chi2_contingency(pathology)[1]
# Compute Chi squared statistic for PET class distribution between outcomes (restricted+subtle vs diffuse+multilobar vs normal)
petclass = [[35,14],[4,5],[7,8]]
print 'The p-value for Fisher test on PET class: %0.3f'%sp.stats.chi2_contingency(petclass)[1]
# lesional_table = [[35,14],[11,13]]
# print fisher_exact(lesional_table)
# Count concordance of PET class and lobe in favorable outcome
t = tmp[res==1]['PET Class'] + ' ' + tmp[res==1]['PET Read']
Lcnt = 0
Rcnt = 0
for tt in t:
if'R ' in tt or 'S ' in tt:
if('LTL' in tt):
Lcnt += 1
elif('RTL' in tt):
Rcnt += 1
# print Lcnt
# print Rcnt
# print t
# Count concordance of PET Class and lobe in poor outcome
t = tmp[res>1]['PET Class'] + ' ' + tmp[res>1]['PET Read']
Lcnt = 0
Rcnt = 0
for tt in t:
if'R ' in tt or 'S ' in tt:
if('LTL' in tt):
Lcnt += 1
elif('RTL' in tt):
Rcnt += 1
# print Lcnt
# print Rcnt
# print t
# Compute proportion of concordant hemisphere resected and PET localization
vals = tmp['Resected Hemi'] + ' ' + map(lambda x: str(x)[0], tmp['PET Read'])
cnt = 0
for val in vals:
cnt += int(val[0] == val[2])
# print cnt/(1.0*len(vals))
# Count Lesional status
mask = tmp['Lesional?'] == 'L'
t = tmp[mask]
# print t['PET Class'].value_counts()
# mask = tmp['Lesional?'] == 'NL'
t = tmp[mask]
# print t['PET Class'].value_counts()
# Count Non-Lesional patients with subtle hypometabolism amongst patients with poor outcome
mask = tmp['Lesional?'] == 'NL'
t = tmp[mask]
mask = t['PET Class'] == 'S'
t = t[mask]
# print len(t[res > 1])
# Count EEG localizations among patients with favorable outcome
# print tmp[res==1]['EEG Read']
# Count lesional patients amongts restricted+subtle PET hypometabolisms
lesional_table = [[19+7,6],[5+4,5+3]]
print 'The p-value for Fisher test on Lesion status in patients with PET restricted/subted hypometablism: %0.3f'%sp.stats.fisher_exact(lesional_table)[1]
# Load trinary outcome data
tmp = pd.read_csv('../../clinical/clinical_outcome_trinary.csv')
mask = tmp.Outcome.isin(['Not Available'])
tmp = tmp[~mask]
mask = tmp['Resected Hemi'].isin(['Not Available'])
tmp = tmp[~mask]
mask = tmp['PET Class'].isin(['Not Available'])
tmp = tmp[~mask]
res = pd.to_numeric(tmp['Outcome'])
# Count and calculate proportion of Engel IA vs Engel IB, IC and ID vs Engel II, III and IV.
# print sorted(tmp['Anon ID'])
# print tmp.outcome_trinary.value_counts()
# print tmp.Outcome.value_counts()
# print 46/73.0, 16/73.0, 6/73.0, 5/73.0
In [13]:
X = [['Clinical alone',mean_auc_clinical_alone,mean_auc_clinical_alone-sigma_auc_clinical_alone,min(1,mean_auc_clinical_alone+sigma_auc_clinical_alone)],
['Clinical and 3d-SSP Features',mean_auc_3dssp,mean_auc_3dssp-sigma_auc_3dssp,min(1,mean_auc_3dssp+sigma_auc_3dssp)],
['Clinical and Voxel-based AI Features',mean_auc_voxel_ai,mean_auc_voxel_ai-sigma_auc_voxel_ai,min(1,mean_auc_voxel_ai+sigma_auc_voxel_ai)],
['All clinical and quantitative imaging Features',mean_auc_all,mean_auc_all-sigma_auc_all,min(1,mean_auc_all+sigma_auc_all)]]
print pd.DataFrame(X,columns=['Model','Mean AUC','95% CI low','95% CI high'])
In [731]:
# Load Model 4 and compute ROC on validation (test) cohort
clf = joblib.load('classifier4.pkl')
y4_hat = clf.predict_proba(X4_test)
fpr, tpr, thresholds = roc_curve(y4_test,y4_hat[:,1])
roc_auc = auc(fpr, tpr)
# Compute Sn/Sp at optimal threshold where both Sn=TPR and Sp=1-FPR are maximized
res = y4_hat[:,1]>thresholds[2]
res_num = []
test_pt_idx = []
for ii,k in enumerate(res):
if(k):
res_num.append(1)
else:
res_num.append(0)
test_pt_idx.append(df4.iloc[df4_test_idx[0][ii]].id)
print 'Sensitivity and Specificity of Model 4 is %.2f%%, and %.2f%% when threshold %.2f is used.'%(tpr[2]*100,(1-fpr[2])*100,thresholds[2])
Generate PDF document of all decision trees in final Model 4 random forest
In [732]:
# Load model 4
clf = joblib.load('classifier4.pkl')
def correct_label_names(X_labels):
'''
This helper function replaces all variables with their appropriate feature name for clarity.
'''
# Print all feature importances
ssp_labels = [u'*',u'Angular Gyrus',u'Anterior Cingulate',u'Caudate', u'Cerebellar Lingual',u'Cerebellar Tonsil',u'Cingulate Gyrus',u'Culmen', u'Culmen of Vermis',u'Cuneus',u'Declive',u'Declive of Vermis', u'Extra-Nuclear',u'Fusiform Gyrus',u'Inferior Frontal Gyrus', u'Inferior Occipital Gyrus',u'Inferior Parietal Lobule', u'Inferior Semi-Lunar Lobule',u'Inferior Temporal Gyrus',u'Lingual Gyrus', u'Medial Frontal Gyrus',u'Middle Frontal Gyrus',u'Middle Occipital Gyrus', u'Middle Temporal Gyrus',u'Nodule',u'Orbital Gyrus',u'Paracentral Lobule', u'Parahippocampal Gyrus',u'Postcentral Gyrus',u'Posterior Cingulate', u'Precentral Gyrus',u'Precuneus',u'Pyramis',u'Pyramis of Vermis', u'Rectal Gyrus',u'Subcallosal Gyrus',u'Superior Frontal Gyrus', u'Superior Occipital Gyrus',u'Superior Parietal Lobule', u'Superior Temporal Gyrus',u'Supramarginal Gyrus',u'Thalamus', u'Transverse Temporal Gyrus',u'Tuber',u'Tuber of Vermis',u'Uncus',u'Uvula', u'Uvula of Vermis']
fnames = []
for label in X_labels:
fname = label
# Find all features in current feature
def repl(m):
try:
return m.group(1) + m.group(2) + '* ' + m.group(3)
except Exception:
return m.group(1) + m.group(2)
fname = re.sub(r'(ipsi|contra|eeg|pet|lesional)(_[a-zYN0-9_ ]+)[ ]*(ipsi|contra|eeg|pet|lesional)',repl,fname)
fname = re.sub(r'(3dssp[A-Za-z_]+)([0-9]+)[ ]*',lambda x: ssp_labels[int(x.group(2))],fname)
fname = fname.replace('_',' ')
# fname = fname.replace('contra','ipsx').replace('ipsi','contrx')
# fname = fname.replace('contrx','contra').replace('ipsx','ipsi')
fname = fname.upper()
fname = fname.replace(' Z ',' Z-score ').replace(' AI ',' AsymmetryIndex ')
fnames.append(fname)
return fnames
# Generate all labels using helper function
new_labels = correct_label_names(X4_labels)
# Generate PDF
os.system('rm *.dot')
for i_tree, tree_in_forest in enumerate(clf.estimators_):
with open('tree_' + str(i_tree) + '.dot','w') as my_file:
my_file = tree.export_graphviz(tree_in_forest, out_file = my_file, feature_names=new_labels)
os.system('dot -Tps *.dot -o all_decision_graphs.pdf')
os.system('rm *.dot')
pass
In [257]:
# Load all models and compute feature importances
clf1 = joblib.load('classifier1.pkl')
clf2 = joblib.load('classifier2.pkl')
clf3 = joblib.load('classifier3.pkl')
clf4 = joblib.load('classifier4.pkl')
importances1 = clf1.feature_importances_
importances2 = clf2.feature_importances_
importances3 = clf3.feature_importances_
importances4 = clf4.feature_importances_
In [544]:
def get_top_K_fnames(classifier,X_labels,all_labels,importances,K):
'''
This helper function gets the top K labels given labels and their importances from the RandomForest classifier.
'''
# Sort feature importances
std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Print all feature importances
ssp_labels = [u'*',u'Angular Gyrus',u'Anterior Cingulate',u'Caudate', u'Cerebellar Lingual',u'Cerebellar Tonsil',u'Cingulate Gyrus',u'Culmen', u'Culmen of Vermis',u'Cuneus',u'Declive',u'Declive of Vermis', u'Extra-Nuclear',u'Fusiform Gyrus',u'Inferior Frontal Gyrus', u'Inferior Occipital Gyrus',u'Inferior Parietal Lobule', u'Inferior Semi-Lunar Lobule',u'Inferior Temporal Gyrus',u'Lingual Gyrus', u'Medial Frontal Gyrus',u'Middle Frontal Gyrus',u'Middle Occipital Gyrus', u'Middle Temporal Gyrus',u'Nodule',u'Orbital Gyrus',u'Paracentral Lobule', u'Parahippocampal Gyrus',u'Postcentral Gyrus',u'Posterior Cingulate', u'Precentral Gyrus',u'Precuneus',u'Pyramis',u'Pyramis of Vermis', u'Rectal Gyrus',u'Subcallosal Gyrus',u'Superior Frontal Gyrus', u'Superior Occipital Gyrus',u'Superior Parietal Lobule', u'Superior Temporal Gyrus',u'Supramarginal Gyrus',u'Thalamus', u'Transverse Temporal Gyrus',u'Tuber',u'Tuber of Vermis',u'Uncus',u'Uvula', u'Uvula of Vermis']
fnames = []
for f in range(K):
fname = X_labels[indices[f]]
print('%d. feature %d %s (%f)'%(f+1,indices[f],X_labels[indices[f]],importances[indices[f]]))
# Find all features in current feature
def repl(m):
try:
return m.group(1) + m.group(2) + '* ' + m.group(3)
except Exception:
return m.group(1) + m.group(2)
fname = re.sub(r'(ipsi|contra|eeg|pet|lesional)(_[a-zYN0-9_ ]+)[ ]*(ipsi|contra|eeg|pet|lesional)',repl,fname)
fname = re.sub(r'(3dssp[A-Za-z_]+)([0-9]+)[ ]*',lambda x: ssp_labels[int(x.group(2))],fname)
fname = fname.replace('_',' ')
# fname = fname.replace('contra','ipsx').replace('ipsi','contrx')
# fname = fname.replace('contrx','contra').replace('ipsx','ipsi')
fname = fname.upper()
fname = fname.replace(' Z ',' Z-score ').replace(' AI ',' AsymmetryIndex ')
fnames.append(fname)
return fnames, indices
def plot_feature_importances(model_number, fnames, importances, indices):
'''
This function plots a horizontal bar plot of features sorted by importance.
'''
# Plot the feature importances of the forest
plt.figure(figsize=(12,10))
plt.title("Top %i Feature importances for Model %i"%(len(fnames),model_number))
# Plot the feature importances of the forest
pos = range(20)
val = importances[indices[:20][::-1]]
plt.barh(pos, val,
color="r", align="center")
plt.yticks(range(20), fnames[:20][::-1])
plt.ylim([-1, 20])
plt.show()
# Initialize label names
for model_number in [1,2,3,4]:
if(model_number == 1):
classifier = classifier1
X_labels = X1_labels
all_labels = df1.columns.difference(['Unnamed: 0','id','outcome_binary'])
importances = importances1
elif(model_number == 2):
classifier = classifier2
X_labels = X2_labels
all_labels = df2.columns.difference(['Unnamed: 0','id','outcome_binary'])
importances = importances2
elif(model_number == 3):
classifier = classifier3
X_labels = X3_labels
all_labels = df3.columns.difference(['Unnamed: 0','id','outcome_binary'])
importances = importances3
elif(model_number == 4):
classifier = classifier4
X_labels = X4_labels
all_labels = df4.columns.difference(['Unnamed: 0','id','outcome_binary'])
importances = importances4
fnames, indices = get_top_K_fnames(classifier,X_labels,all_labels,importances,20)
plot_feature_importances(model_number, fnames, importances, indices)
In [542]:
# Compute feature importances
clf1 = joblib.load('classifier1.pkl')
clf2 = joblib.load('classifier2.pkl')
clf3 = joblib.load('classifier3.pkl')
clf4 = joblib.load('classifier4.pkl')
def _helper(job):
'''
This helper function runs one instance of model fitting and returns the top feature importances and their name.
'''
classifier,X_labels,all_labels,importances,K = job
# Sort feature importances
std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Print all feature importances
ssp_labels = [u'*',u'Angular Gyrus',u'Anterior Cingulate',u'Caudate', u'Cerebellar Lingual',u'Cerebellar Tonsil',u'Cingulate Gyrus',u'Culmen', u'Culmen of Vermis',u'Cuneus',u'Declive',u'Declive of Vermis', u'Extra-Nuclear',u'Fusiform Gyrus',u'Inferior Frontal Gyrus', u'Inferior Occipital Gyrus',u'Inferior Parietal Lobule', u'Inferior Semi-Lunar Lobule',u'Inferior Temporal Gyrus',u'Lingual Gyrus', u'Medial Frontal Gyrus',u'Middle Frontal Gyrus',u'Middle Occipital Gyrus', u'Middle Temporal Gyrus',u'Nodule',u'Orbital Gyrus',u'Paracentral Lobule', u'Parahippocampal Gyrus',u'Postcentral Gyrus',u'Posterior Cingulate', u'Precentral Gyrus',u'Precuneus',u'Pyramis',u'Pyramis of Vermis', u'Rectal Gyrus',u'Subcallosal Gyrus',u'Superior Frontal Gyrus', u'Superior Occipital Gyrus',u'Superior Parietal Lobule', u'Superior Temporal Gyrus',u'Supramarginal Gyrus',u'Thalamus', u'Transverse Temporal Gyrus',u'Tuber',u'Tuber of Vermis',u'Uncus',u'Uvula', u'Uvula of Vermis']
fnames = []
for f in range(K):
fname = X_labels[indices[f]]
# Find all features in current feature
def repl(m):
try:
return m.group(1) + m.group(2) + '* ' + m.group(3)
except Exception:
return m.group(1) + m.group(2)
fname = re.sub(r'(ipsi|contra|eeg|pet|lesional)(_[a-zYN0-9_ ]+)[ ]*(ipsi|contra|eeg|pet|lesional)',repl,fname)
fname = re.sub(r'(3dssp[A-Za-z_]+)([0-9]+)[ ]*',lambda x: ssp_labels[int(x.group(2))],fname)
fname = fname.replace('_',' ')
# fname = fname.replace('contra','ipsx').replace('ipsi','contrx')
# fname = fname.replace('contrx','contra').replace('ipsx','ipsi')
fname = fname.upper()
fname = fname.replace(' Z ',' Z-score ').replace(' AI ',' AsymmetryIndex ')
fnames.append(fname)
return fnames
# Parallel version
n_iter = 100
K = 20
def run_jobs(clf, X_train, y_train, X_labels, all_labels, K):
'''
This function runs n_iter instances of model fitting and keeps track of the top $K$ features.
This function uses multiple cores to speed up computation.
'''
jobs = []
for n in range(n_iter):
clf.fit(X_train, y_train)
importances = clf.feature_importances_
classifier = clf
jobs.append((classifier,X_labels,all_labels,importances,K))
return_list = []
n_proc = 40
pool = Pool(n_proc)
return_list = pool.map(_helper, jobs)
pool.close()
return return_list
for model_number in [1,2,3,4]:
if(model_number == 1):
clf = clf1
X_train = X1_train
y_train = y1_train
X_labels = X1_labels
all_labels = df1.columns.difference(['Unnamed: 0','id','outcome_binary'])
return_list1 = run_jobs(clf, X_train, y_train, X_labels, all_labels, K)
elif(model_number == 2):
clf = clf2
X_train = X2_train
y_train = y2_train
X_labels = X2_labels
all_labels = df2.columns.difference(['Unnamed: 0','id','outcome_binary'])
return_list2 = run_jobs(clf, X_train, y_train, X_labels, all_labels, K)
elif(model_number == 3):
clf = clf3
X_train = X3_train
y_train = y3_train
X_labels = X3_labels
all_labels = df3.columns.difference(['Unnamed: 0','id','outcome_binary'])
return_list3 = run_jobs(clf, X_train, y_train, X_labels, all_labels, K)
elif(model_number == 4):
clf = clf4
X_train = X4_train
y_train = y4_train
X_labels = X4_labels
all_labels = df4.columns.difference(['Unnamed: 0','id','outcome_binary'])
return_list4 = run_jobs(clf, X_train, y_train, X_labels, all_labels, K)
In [543]:
from operator import itemgetter
for model_number in [1,2,3,4]:
if(model_number == 1):
return_list = return_list1
elif(model_number == 2):
return_list = return_list2
elif(model_number == 3):
return_list = return_list3
elif(model_number == 4):
return_list = return_list4
final_feature_dict = {}
for res in return_list:
for fn in res:
if(fn not in final_feature_dict.keys()):
final_feature_dict[fn] = 1
else:
final_feature_dict[fn] += 1
results = sorted(final_feature_dict.items(), key=itemgetter(1), reverse=True)
# Plot the feature importances of the forest
plt.figure(figsize=(12,10))
plt.title("Top %i Feature counts for Model %i"%(K, model_number))
# Plot the feature importances of the forest
pos = range(K)
val = map(lambda x: x[1], results)
val = np.array(val)/(1.0*max(val))
fnames = map(lambda x: x[0], results)
val = val[:K][::-1]
fnames = fnames[:K][::-1]
plt.barh(pos, val,
color="r", align="center")
plt.yticks(range(K), fnames)
plt.ylim([-1, K])
plt.show()
In [221]:
# Load Model 4
clf = joblib.load('classifier4.pkl')
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
# Load the 2 patient indices
pt_idx = [0,1]
# Iterate over pairs of the top 3 features
pairs = [[0,1],[0,2],[1,2]]
for pair in pairs:
X = X4_train[:,indices[pair]]
y = y4_train
ssp_labels = [u'*',u'Angular Gyrus',u'Anterior Cingulate',u'Caudate', u'Cerebellar Lingual',u'Cerebellar Tonsil',u'Cingulate Gyrus',u'Culmen', u'Culmen of Vermis',u'Cuneus',u'Declive',u'Declive of Vermis', u'Extra-Nuclear',u'Fusiform Gyrus',u'Inferior Frontal Gyrus', u'Inferior Occipital Gyrus',u'Inferior Parietal Lobule', u'Inferior Semi-Lunar Lobule',u'Inferior Temporal Gyrus',u'Lingual Gyrus', u'Medial Frontal Gyrus',u'Middle Frontal Gyrus',u'Middle Occipital Gyrus', u'Middle Temporal Gyrus',u'Nodule',u'Orbital Gyrus',u'Paracentral Lobule', u'Parahippocampal Gyrus',u'Postcentral Gyrus',u'Posterior Cingulate', u'Precentral Gyrus',u'Precuneus',u'Pyramis',u'Pyramis of Vermis', u'Rectal Gyrus',u'Subcallosal Gyrus',u'Superior Frontal Gyrus', u'Superior Occipital Gyrus',u'Superior Parietal Lobule', u'Superior Temporal Gyrus',u'Supramarginal Gyrus',u'Thalamus', u'Transverse Temporal Gyrus',u'Tuber',u'Tuber of Vermis',u'Uncus',u'Uvula', u'Uvula of Vermis']
fnames = []
for label in X_labels[indices[pair]]:
fname = label
# Find all features in current feature
def repl(m):
try:
return m.group(1) + m.group(2) + '*\n ' + m.group(3)
except Exception:
return m.group(1) + m.group(2)
fname = re.sub(r'(ipsi|contra|eeg|pet|lesional)(_[a-zYN0-9_ ]+)[ ]*(ipsi|contra|eeg|pet|lesional)',repl,fname)
fname = re.sub(r'(3dssp[A-Za-z_]+)([0-9]+)[ ]*',lambda x: ssp_labels[int(x.group(2))],fname)
fname = fname.replace('_',' ')
# fname = fname.replace('contra','ipsx').replace('ipsi','contrx')
# fname = fname.replace('contrx','contra').replace('ipsx','ipsi')
fname = fname.title()
fname = fname.replace(' Z ',' Z-score ').replace(' AI ',' AsymmetryIndex ')
fnames.append(fname)
# Standardize
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
# Parameters
n_classes = 2
n_estimators = 30
plot_colors = "rb"
cmap = plt.cm.RdYlBu
plot_step = 0.02 # fine step width for decision surface contours
plot_step_coarser = 0.5 # step widths for coarse classifier guesses
model = RandomForestClassifier(n_estimators=n_estimators)
clf = model.fit(X,y)
scores = clf.score(X,y)
# Now plot the decision boundary using a fine mesh as input to a
# filled contour plot
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
np.arange(y_min, y_max, plot_step))
# Plot either a single DecisionTreeClassifier or alpha blend the
# decision surfaces of the ensemble of classifiers
if isinstance(model, DecisionTreeClassifier):
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=cmap)
else:
# Choose alpha blend level with respect to the number of estimators
# that are in use (noting that AdaBoost can use fewer estimators
# than its maximum if it achieves a good enough fit early on)
estimator_alpha = 1.0 / len(model.estimators_)
for tree in model.estimators_:
Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)
# Build a coarser grid to plot a set of ensemble classifications
# to show how these are different to what we see in the decision
# surfaces. These points are regularly space and do not have a black outline
xx_coarser, yy_coarser = np.meshgrid(np.arange(x_min, x_max, plot_step_coarser),
np.arange(y_min, y_max, plot_step_coarser))
Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(), yy_coarser.ravel()]).reshape(xx_coarser.shape)
cs_points = plt.scatter(xx_coarser, yy_coarser, s=15, c=Z_points_coarser, cmap=cmap, edgecolors="none")
# Plot the training points, these are clustered together and have a
# black outline
for i, c in zip(xrange(n_classes), plot_colors):
idx = np.where(y == i)
if(pair == [0,2]):
plt.scatter(X[idx, 0], X[idx, 1], c=c, label=X4_labels[indices[pair]],
cmap=cmap, alpha=0.5)
else:
plt.scatter(X[idx, 0], X[idx, 1], c=c, label=X4_labels[indices[pair]],
cmap=cmap)
X = X4_test[:,indices[pair]]
y = y4_test
# Standardize
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
for pt_id in pt_idx:
print X[pt_id,:], test_pt_idx[pt_id]
for pt_id in pt_idx:
plt.scatter(X[pt_id, 0], X[pt_id, 1], c=['r','b'][pt_id], alpha=0.9, cmap=cmap)
plt.scatter(X[pt_id, 0], X[pt_id, 1], c=['g','y'][pt_id], alpha=0.5, cmap=cmap)
plt.suptitle("Classifier decision surface on 3 most important features")
plt.axis("tight")
plt.xlabel(fnames[0], fontsize=7)
plt.ylabel(fnames[1], fontsize=7)
plt.show()
In [65]:
# Load Model 4 and compute ROC using on the top 3 features
clf = joblib.load('classifier4.pkl')
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
y4_hat = clf.fit(X4_train[:,indices[:3]],y4_train).predict_proba(X4_test[:,indices[:3]])
fpr, tpr, thresholds = roc_curve(y4_test,y4_hat[:,1])
roc_auc = auc(fpr, tpr)
i = 4
plt.figure(figsize=(12,10))
plt.plot(fpr, tpr, lw=lw, color='b', label = 'Model %d with only the top 3 features (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Plot the ROC curve
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('Receiver operating characteristic')
plt.xlabel('False Postive Rate')
plt.ylabel('Trupe Postive Rate')
plt.legend(loc="lower right")
plt.show()
In [733]:
'''
This code snipped generates the latex source for Table S1.
All feature definitinos and importances are automatically coded.
'''
def get_clean_name(label):
ssp_labels = [u'*',u'Angular Gyrus',u'Anterior Cingulate',u'Caudate', u'Cerebellar Lingual',u'Cerebellar T??',u'Cingulate Gyrus',u'Culmen', u'Culmen of Vermis',u'Cuneus',u'Declive',u'Declive of Vermis', u'Extra-Nuclear',u'Fusiform Gyrus',u'Inferior Frontal Gyrus', u'Inferior Occipital Gyrus',u'Inferior Parietal Lobule', u'Inferior Semi-Lunar Lobule',u'Inferior Temporal Gyrus',u'Lingual Gyrus', u'Medial Frontal Gyrus',u'Middle Frontal Gyrus',u'Middle Occipital Gyrus', u'Middle Temporal Gyrus',u'Nodule',u'Orbital Gyrus',u'Paracentral Lobule', u'Parahippocampal Gyrus',u'Postcentral Gyrus',u'Posterior Cingulate', u'Precentral Gyrus',u'Precuneus',u'Pyramis',u'Pyramis of Vermis', u'Rectal Gyrus',u'Subcallosal Gyrus',u'Superior Frontal Gyrus', u'Superior Occipital Gyrus',u'Superior Parietal Lobule', u'Superior Temporal Gyrus',u'Supramarginal Gyrus',u'Thalamus', u'Transverse Temporal Gyrus',u'Tuber',u'Tuber of Vermis',u'Uncus',u'Uvula', u'Uvula of Vermis']
fname = label
# Find all features in current feature
def repl(m):
try:
return m.group(1) + m.group(2) + '*\n ' + m.group(3)
except Exception:
return m.group(1) + m.group(2)
fname = re.sub(r'(ipsi_pet|contra_pet|ipsi|contra|eeg|lesional)(_[a-zYN0-9_, ]+)[ ]*(ipsi_pet|contra_pet|ipsi|contra|eeg|lesional)',repl,fname)
fname = re.sub(r'3dssp([A-Za-z_]+)([0-9]+)',lambda x: x.group(1)+ssp_labels[int(x.group(2))],fname)
fname = fname.replace('_',' ')
fname = fname.replace(' ',' ')
# fname = fname.replace('contra','ipsx').replace('ipsi','contrx')
# fname = fname.replace('contrx','contra').replace('ipsx','ipsi')
fname = fname.title()
fname = fname.replace(' Z ',' Z-score ').replace(' AI ',' AsymmetryIndex ')
return fname
def get_latex_name(name):
names = name.split('*')
if len(names) == 1:
if('^2' in name):
if('Ipsi Voxel Ai' in name):
start = name.index('Ipsi Voxel Ai') + len('Ipsi Voxel Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)^2$'''%region
elif('Contra Voxel Ai' in name):
start = name.index('Contra Voxel Ai') + len('Contra Voxel Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)^2$'''%region
elif('Ipsi Region Ai' in name):
start = name.index('Ipsi Region Ai') + len('Ipsi Region Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Region Ai' in name):
start = name.index('Contra Region Ai') + len('Contra Region Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\beta-\alpha}{\alpha+\beta}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Ipsi Z-score Ai' in name):
start = name.index('Ipsi Z-score Ai') + len('Ipsi Z-score Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Z-score Ai' in name):
start = name.index('Contra Z-score Ai') + len('Contra Z-score Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\beta-\alpha}{\beta+\alpha}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Ipsi Z-score Count Neg' in name):
start = name.index('Ipsi Z-score Count Neg') + len('Ipsi Z-score Count Neg') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] < -1.96\bigg)^2$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%(region)
elif('Contra Z-score Count Neg' in name):
start = name.index('Contra Z-score Count Neg') + len('Contra Z-score Count Neg') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] < -1.96\bigg)^2$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Ipsi Z-score Count Pos' in name):
start = name.index('Ipsi Z-score Count Pos') + len('Ipsi Z-score Count Pos') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] > 1.96\bigg)^2$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Contra Z-score Count Pos' in name):
start = name.index('Contra Z-score Count Pos') + len('Contra Z-score Count Pos') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] > 1.96\bigg)^2$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Ipsi Z-score' in name):
start = name.index('Ipsi Z-score') + len('Ipsi Z-score') + 1
end = name.index('^2')
region = name[start:end]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)^2$'''%region
elif('Contra Z-score' in name):
start = name.index('Contra Z-score') + len('Contar Z-score') + 1
end = name.index('^2')
region = name[start:end]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)^2$'''%region
elif('Ipsi Pet Ai' in name):
start = name.index('Ipsi Pet Ai') + len('Ipsi Pet Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Pet Ai' in name):
start = name.index('Contra Pet Ai') + len('Contra Pet Ai') + 1
end = name.index('^2')
region = name[start:end]
name = r'''\specialcell{$\frac{\beta-\alpha}{\alpha+\beta}^2$ \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
else:
print name
else:
if('Ipsi Voxel Ai' in name):
start = name.index('Ipsi Voxel Ai') + len('Ipsi Voxel Ai') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Voxel Ai' in name):
start = name.index('Contra Voxel Ai') + len('Contra Voxel Ai') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Region Ai' in name):
start = name.index('Ipsi Region Ai') + len('Ipsi Region Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}$ \\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Region Ai' in name):
start = name.index('Contra Region Ai') + len('Contra Region Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\beta-\alpha}{\alpha+\beta}$ \\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Ipsi Z-score Ai' in name):
start = name.index('Ipsi Z-score Ai') + len('Ipsi Z-score Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}$ \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Z-score Ai' in name):
start = name.index('Contra Z-score Ai') + len('Contra Z-score Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\beta-\alpha}{\beta+\alpha}$ \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Ipsi Z-score Count Neg' in name):
start = name.index('Ipsi Z-score Count Neg') + len('Ipsi Z-score Count Neg') + 1
region = name[start:]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] < -1.96\bigg)$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Contra Z-score Count Neg' in name):
start = name.index('Contra Z-score Count Neg') + len('Contra Z-score Count Neg') + 1
region = name[start:]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] < -1.96\bigg)$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Ipsi Z-score Count Pos' in name):
start = name.index('Ipsi Z-score Count Pos') + len('Ipsi Z-score Count Pos') + 1
region = name[start:]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] > 1.96\bigg)$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Contra Z-score Count Pos' in name):
start = name.index('Contra Z-score Count Pos') + len('Contra Z-score Count Pos') + 1
region = name[start:]
name = r'''\specialcell{$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] > 1.96\bigg)$ \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function}'''%region
elif('Ipsi Z-score' in name):
start = name.index('Ipsi Z-score') + len('Ipsi Z-score') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Z-score' in name):
start = name.index('Contra Z-score') + len('Contar Z-score') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Pet Ai' in name):
start = name.index('Ipsi Pet Ai') + len('Ipsi Pet Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\alpha-\beta}{\alpha+\beta}$ \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
elif('Contra Pet Ai' in name):
start = name.index('Contra Pet Ai') + len('Contra Pet Ai') + 1
region = name[start:]
name = r'''\specialcell{$\frac{\beta-\alpha}{\alpha+\beta}$ \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$}'''%(region,region)
else:
print name
return name
else:
if(len(names) > 2):
print name
else:
new_names = []
suffix_txt = ''
name = names[0]
if('Ipsi Voxel Ai' in name):
start = name.index('Ipsi Voxel Ai') + len('Ipsi Voxel Ai') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Voxel Ai' in name):
start = name.index('Contra Voxel Ai') + len('Contra Voxel Ai') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Region Ai' in name):
start = name.index('Ipsi Region Ai') + len('Ipsi Region Ai') + 1
region = name[start:]
name = r'''$\frac{\alpha-\beta}{\alpha+\beta}$'''
suffix_txt = r'''\\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Region Ai' in name):
start = name.index('Contra Region Ai') + len('Contra Region Ai') + 1
region = name[start:]
name = r'''$\frac{\beta-\alpha}{\alpha+\beta}$'''
suffix_txt = r'''\\where\\ $\alpha = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Ipsi Z-score Ai' in name):
start = name.index('Ipsi Z-score Ai') + len('Ipsi Z-score Ai') + 1
region = name[start:]
name = r'''$\frac{\alpha-\beta}{\alpha+\beta}$'''
suffix_txt = r''' \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Z-score Ai' in name):
start = name.index('Contra Z-score Ai') + len('Contra Z-score Ai') + 1
region = name[start:]
name = r'''$\frac{\beta-\alpha}{\beta+\alpha}$'''
suffix_txt = r''' \\where\\ $\alpha = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Ipsi Z-score Count Neg' in name):
start = name.index('Ipsi Z-score Count Neg') + len('Ipsi Z-score Count Neg') + 1
region = name[start:]
name = r'''$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] < -1.96\bigg)$'''%region
suffix_txt = r''' \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Contra Z-score Count Neg' in name):
start = name.index('Contra Z-score Count Neg') + len('Contra Z-score Count Neg') + 1
region = name[start:]
name = r'''$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] < -1.96\bigg)$'''%region
suffix_txt = r''' \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Ipsi Z-score Count Pos' in name):
start = name.index('Ipsi Z-score Count Pos') + len('Ipsi Z-score Count Pos') + 1
region = name[start:]
name = r'''$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] > 1.96\bigg)$'''%region
suffix_txt = r''' \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Contra Z-score Count Pos' in name):
start = name.index('Contra Z-score Count Pos') + len('Contra Z-score Count Pos') + 1
region = name[start:]
name = r'''$\sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] > 1.96\bigg)$'''%region
suffix_txt = r''' \\where\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Ipsi Z-score' in name):
start = name.index('Ipsi Z-score') + len('Ipsi Z-score') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Z-score' in name):
start = name.index('Contra Z-score') + len('Contar Z-score') + 1
region = name[start:]
name = r'''$\textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Pet Ai' in name):
start = name.index('Ipsi Pet Ai') + len('Ipsi Pet Ai') + 1
region = name[start:]
name = r'''$\frac{\alpha-\beta}{\alpha+\beta}$'''
suffix_txt = r''' \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Pet Ai' in name):
start = name.index('Contra Pet Ai') + len('Contra Pet Ai') + 1
region = name[start:]
name = r'''$\frac{\beta-\alpha}{\alpha+\beta}$'''
suffix_txt = r''' \\where\\ $\alpha = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\beta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
else:
print name
name1 = name
name = names[1]
if('Ipsi Voxel Ai' in name):
start = name.index('Ipsi Voxel Ai') + len('Ipsi Voxel Ai') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Voxel Ai' in name):
start = name.index('Contra Voxel Ai') + len('Contra Voxel Ai') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Region Ai' in name):
start = name.index('Ipsi Region Ai') + len('Ipsi Region Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\gamma-\delta}{\gamma+\delta}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Region Ai' in name):
start = name.index('Contra Region Ai') + len('Contra Region Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\delta-\gamma}{\gamma+\delta}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(AI\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region, region)
elif('Ipsi Z-score Ai' in name):
start = name.index('Ipsi Z-score Ai') + len('Ipsi Z-score Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\gamma-\delta}{\gamma+\delta}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Z-score Ai' in name):
start = name.index('Contra Z-score Ai') + len('Contra Z-score Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\delta-\gamma}{\delta+\gamma}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Ipsi Z-score Count Neg' in name):
start = name.index('Ipsi Z-score Count Neg') + len('Ipsi Z-score Count Neg') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] < -1.96\bigg)$'''%region
suffix_txt += r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Contra Z-score Count Neg' in name):
start = name.index('Contra Z-score Count Neg') + len('Contra Z-score Count Neg') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] < -1.96\bigg)$'''%region
suffix_txt += r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Ipsi Z-score Count Pos' in name):
start = name.index('Ipsi Z-score Count Pos') + len('Ipsi Z-score Count Pos') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big] > 1.96\bigg)$'''%region
suffix_txt += r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Contra Z-score Count Pos' in name):
start = name.index('Contra Z-score Count Pos') + len('Contra Z-score Count Pos') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \sum \mathbbm{1}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big] > 1.96\bigg)$'''%region
suffix_txt += r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function'''
elif('Ipsi Z-score' in name):
start = name.index('Ipsi Z-score') + len('Ipsi Z-score') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$'''%region
elif('Contra Z-score' in name):
start = name.index('Contra Z-score') + len('Contar Z-score') + 1
region = name[start:]
name = name1 + r'''$\times$ \\ $ \textsc{Mean}\bigg(Z\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%region
elif('Ipsi Pet Ai' in name):
start = name.index('Ipsi Pet Ai') + len('Ipsi Pet Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\gamma-\delta}{\gamma+\delta}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
elif('Contra Pet Ai' in name):
start = name.index('Contra Pet Ai') + len('Contra Pet Ai') + 1
region = name[start:]
name = name1 + r''' $\times \frac{\delta-\gamma}{\gamma+\delta}$'''
suffix_txt += r'''\\ $\gamma = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Ipsi. %s}\big]\bigg)$\\$\delta = \textsc{Mean}\bigg(P\big[x|\Lambda=\textsc{Contra. %s}\big]\bigg)$'''%(region,region)
else:
print name
suffix_txt = suffix_txt.replace(r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function''',r'''\\ $\sum \mathbbm{1} ( \cdot )$ is the indicator function''')
return r'''\specialcell{'''+name + suffix_txt+r'''}'''
clf = joblib.load('classifier4.pkl')
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for ix in indices:
pass
# print X4_labels[ix],'\n',get_clean_name(X4_labels[ix]), importances[ix]
beginning = r'''\documentclass[a4paper]{article}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Sets page size and margins
\usepackage[a4paper,top=3cm,bottom=2cm,left=2cm,right=2cm,marginparwidth=1.75cm]{geometry}
%% Useful packages
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[colorlinks=true, allcolors=blue]{hyperref}
\usepackage{array}
\usepackage{bbm}
\usepackage{longtable}
\usepackage{lscape}
\usepackage{float}
\newenvironment{conditions}
{\par\vspace{\abovedisplayskip}\noindent\begin{tabular}{>{$}l<{$} @{${}={}$} l}}
{\end{tabular}\par\vspace{\belowdisplayskip}}
\newcommand{\specialcell}[2][c]{%
\begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}
\begin{document}
\section{Figure S1. Feature importance plot}
\begin{figure}[H]
\centering
\includegraphics[scale=5]{FigureS1.png}
\end{figure}
\section{Figure S2. Examples of a patient with incorrect prediction}
\begin{figure}[H]
\centering
\includegraphics[scale=20]{FigureS2.png}
\end{figure}
\section{Figure S3. Decision surfaces of classifier on 3 most important features}
\begin{figure}[H]
\centering
\includegraphics[scale=0.7]{FigureS3.png}
\end{figure}
\section{Figure S4. Feature reduction search space}
\begin{figure}[H]
\centering
\includegraphics[scale=10]{FigureS4.png}
\end{figure}
\section{Figure S6. Comparison of Model 4 with previously published models.}
\begin{figure}[H]
\centering
\includegraphics[scale=15]{FigureS6.png}
\end{figure}
\section{Table S1. List of all features and definitions}
\subsection{Definitions}
Before describing our feature list in further detail, we introduce the following nomenclature:
\begin{conditions}
P & PET Image \\
\bar{P} & Mirror PET Image affinely registered to $P$ \\
AI & $\frac{P-\bar{P}}{P+\bar{P}}$ \\
Z & Maximum Intensity Projection (MIP) $z$-scores obtained from 3D-SSP \\
\Omega_P & Image domain of PET Image $P$ \\
\Omega_Z & Image domain of cortical surface on which the MIP is projected by 3D-SSP \\
\Lambda & Label map across image domain $\Omega_P$ and $\Omega_Z$ \\
\Theta_{\textsc{AAL}} & The set of labels that correspond to regions in the AAL Atlas Parcellation \\
\Theta_{\textsc{Z}} & The set of labels that correspond to cortical surface regions in Talairach space, as output by 3D-SSP
\end{conditions}
The value of the $AI$ feature map at all possible locations $x \in \Omega_P$ in region $a \in \Theta_{\textsc{AAL}}$ is denoted by $AI[x|\Lambda=a]$. Similarly, for $Z$ feature map, it is denoted by $Z[x|\Lambda=a]$ for all $x \in \Omega_Z$ in region $a \in \Theta_{\textsc{Z}}$. The following table S1 lists all features, their mathematical descriptions and feature importance as measured by Gini impurity index.
\subsection{Top 175 of 350 Features List}
'''
ending = r'''
\end{document}
'''
latex_txt = ''
multiple = 5
for k in range(len(indices)/multiple/2):
s = r'''
\begin{table}[h]
\centering
\begin{longtable}{ccc}
\textbf{\textsc{Feature ID}} & \textbf{\textsc{Description}} & \textbf{\textsc{Importance}}\\
'''
e = r'''
\end{longtable}
\end{table}'''
latex_txt += s
for ix in indices[k*multiple:(k+1)*multiple]:
name = get_clean_name(X4_labels[ix])
name = get_latex_name(name)
line = r'''%i & %s & %0.5f\\\\[4pt]'''%(ix,name,importances[ix])
latex_txt = latex_txt + line + '\n'
latex_txt = latex_txt[:-10] + e
os.system('rm tableS1.tex')
open('tableS1.tex','w').write(beginning + latex_txt + ending)
In [565]:
# Load all features DataFrame
df_all = pd.read_csv('../data/qPET_data.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df_all['id'].isin(['PET75','PET77'])
df_all = df_all[~mask]
# Sort the featrues by importance
clf = joblib.load('classifier4.pkl')
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
# Get the feature column of first feature
X = np.sqrt(X4_train[:,indices[0]])
y = y4_train
X_good = X[y == 0]
X_bad = X[y == 1]
# Print distribution of first feature
s,p = sp.stats.mannwhitneyu(X_good,X_bad)
print p, X4_labels[indices[0]]
data = [X_good, X_bad]
plt.figure()
plt.boxplot(data)
plt.xticks([1,2],['Good','Poor'])
plt.ylim([-0.0, 0.35])
plt.show()
# Get the feature column of second feature
X = X4_train[:,indices[1]]
y = y4_train
X_good = X[y == 0]
X_bad = X[y == 1]
# Print distribution of second feature
s,p = sp.stats.mannwhitneyu(X_good,X_bad)
print p, X4_labels[indices[1]]
data = [X_good, X_bad]
plt.figure()
plt.boxplot(data)
plt.xticks([1,2],['Good','Poor'])
plt.show()
# Get the feature column of third feature
X = X4_train[:,indices[2]]
y = y4_train
X_good = X[y == 0]
X_bad = X[y == 1]
# Print distribution of third feature
s,p = sp.stats.mannwhitneyu(X_good,X_bad)
print p, X4_labels[indices[2]]
data = [X_good, X_bad]
plt.figure()
plt.boxplot(data)
plt.xticks([1,2],['Good','Poor'])
plt.show()
# Print p value of Mann Whitney U test among the top $K$=20 features
K = 20
for k in range(K):
X = X4_train[:,indices[k]]
y = y4_train
X_good = X[y == 0]
X_bad = X[y == 1]
s,p = scipy.stats.mannwhitneyu(X_good,X_bad)
# print p, X4_labels[indices[k]]
In [726]:
# Load all features DataFrame
df_S6 = pd.read_csv('../data/qPET_data_with_trinary_outcome.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df_S6['id'].isin(['PET75','PET77','PET18'])
df_S6 = df_S6[~mask]
df_S6.outcome_trinary
df_S6_train_idx = np.where(df_S6.id.isin(train_idx))
df_S6_test_idx = np.where(df_S6.id.isin(test_idx))
X_S6 = np.array(df_S6[df_S6.columns.difference(['Unnamed: 0','id','outcome_binary','outcome_trinary'])])
y_S6 = np.array(df_S6.outcome_trinary-1)
X_S6_labels = df_S6.columns.difference(['Unnamed: 0','id','outcome_binary','outcome_trinary'])
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Build a Random Forest with 5000 estimators
classifier_S6 = RandomForestClassifier(n_estimators=1000)
## Do feature reduction on Model 4
print 'Model 4 ..................'
X = np.copy(X_S6)
y = np.copy(y_S6)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
X_S6_labels = X_S6_labels[prune1.get_support()]
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = classifier_S6.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
X_S6_labels = X_S6_labels[model.get_support()]
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
X_S6_labels = np.array(poly.get_feature_names(X_S6_labels))
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
X_S6_pruned = X
X_S6_labels = X_S6_labels[prune2.get_support()]
# Binarize the output
y_S6 = label_binarize(y, classes=[0, 1, 2])
n_classes = y_S6.shape[1]
X_S6_train = np.squeeze(X_S6_pruned[df_S6_train_idx,:])
y_S6_train = y_S6[df_S6_train_idx]
X_S6_test = np.squeeze(X_S6_pruned[df_S6_test_idx,:])
y_S6_test = y_S6[df_S6_test_idx]
from sklearn.multiclass import OneVsRestClassifier
# Learn to predict each class against the other
classifier_S6 = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000, class_weight='auto'))
y_score = classifier_S6.fit(X_S6_train, y_S6_train).predict_proba(X_S6_test)
from sklearn.metrics import *
y_hat = classifier_S6.fit(X_S6_train, y_S6_train).predict(X_S6_test)
precision = precision_score(y_S6_test,y_hat,average='weighted')
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_S6_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_S6_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Compute macro-average ROC curve and ROC area
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure(figsize=(10,8))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
true_labels = ['A','B','C']
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(true_labels[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of multi-class outcome prediction using Model 4')
plt.legend(loc="lower right")
plt.show()
In [58]:
# Load all features DataFrame
df_S6_Dupont = pd.read_csv('../data/Dupont_et_al_qPET_data.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df_S6_Dupont['id'].isin(['PET75','PET77','PET18'])
df_S6_Dupont = df_S6_Dupont[~mask]
df_S6_Dupont_train_idx = np.where(df_S6_Dupont.id.isin(train_idx))
df_S6_Dupont_test_idx = np.where(df_S6_Dupont.id.isin(test_idx))
from sklearn.discriminant_analysis import *
X_S6_Dupont = np.array(df_S6_Dupont[df_S6_Dupont.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])])
y_S6_Dupont = np.array(df_S6_Dupont.outcome_trinary-1)
X_S6_Dupont_labels = df_S6_Dupont.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])
# Binarize the output
y_S6_Dupont = label_binarize(y_S6_Dupont, classes=[0, 1, 2])
n_classes = y_S6_Dupont.shape[1]
X_S6_Dupont_train = np.squeeze(X_S6_Dupont[df_S6_Dupont_train_idx,:])
y_S6_Dupont_train = y_S6_Dupont[df_S6_Dupont_train_idx]
X_S6_Dupont_test = np.squeeze(X_S6_Dupont[df_S6_Dupont_test_idx,:])
y_S6_Dupont_test = y_S6_Dupont[df_S6_Dupont_test_idx]
from sklearn.multiclass import OneVsRestClassifier
# Learn to predict each class against the other
classifier_S6_Dupont = OneVsRestClassifier(LinearDiscriminantAnalysis())
y_score = classifier_S6_Dupont.fit(X_S6_Dupont_train, y_S6_Dupont_train).predict_proba(X_S6_Dupont_test)
y_hat = classifier_S6_Dupont.fit(X_S6_Dupont_train, y_S6_Dupont_train).predict(X_S6_Dupont_test)
precision = precision_score(y_S6_Dupont_test,y_hat,average='weighted')
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_S6_Dupont_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_S6_Dupont_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Compute macro-average ROC curve and ROC area
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure(figsize=(10,8))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
true_labels = ['A','B','C']
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(true_labels[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of multi-class outcome prediction using Dupont et al (2000)')
plt.legend(loc="lower right")
plt.show()
In [66]:
# Load all features DataFrame
df_S6_Manno = pd.read_csv('../data/Manno_et_al_qPET_data.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df_S6_Manno['id'].isin(['PET75','PET77','PET18'])
df_S6_Manno = df_S6_Manno[~mask]
df_S6_Manno_train_idx = np.where(df_S6_Manno.id.isin(train_idx))
df_S6_Manno_test_idx = np.where(df_S6_Manno.id.isin(test_idx))
from sklearn.discriminant_analysis import *
X_S6_Manno = np.array(df_S6_Manno[df_S6_Manno.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])])
y_S6_Manno = np.array(df_S6_Manno.outcome_trinary-1)
X_S6_Manno_labels = df_S6_Manno.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])
X_S6_Manno = X_S6_Manno.reshape(-1, 1)
# Binarize the output
y_S6_Manno = label_binarize(y_S6_Manno, classes=[0, 1, 2])
n_classes = y_S6_Manno.shape[1]
X_S6_Manno_train = np.squeeze(X_S6_Manno[df_S6_Manno_train_idx,:])
y_S6_Manno_train = y_S6_Manno[df_S6_Manno_train_idx]
X_S6_Manno_test = np.squeeze(X_S6_Manno[df_S6_Manno_test_idx,:])
y_S6_Manno_test = y_S6_Manno[df_S6_Manno_test_idx]
X_S6_Manno_train = X_S6_Manno_train.reshape(-1, 1)
X_S6_Manno_test = X_S6_Manno_test.reshape(-1, 1)
from sklearn.multiclass import OneVsRestClassifier
# Learn to predict each class against the other
classifier_S6_Manno = OneVsRestClassifier(LinearDiscriminantAnalysis())
y_score = classifier_S6_Manno.fit(X_S6_Manno_train, y_S6_Manno_train).predict_proba(X_S6_Manno_test)
y_hat = classifier_S6_Manno.fit(X_S6_Manno_train, y_S6_Manno_train).predict(X_S6_Manno_test)
precision = precision_score(y_S6_Manno_test,y_hat,average='weighted')
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_S6_Manno_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_S6_Manno_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Compute macro-average ROC curve and ROC area
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure(figsize=(10,8))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
true_labels = ['A','B','C']
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(true_labels[i], roc_auc[i]))
pub_tpr = [0,0.5,0.78,0.91,0.94,1.0]
pub_fpr = [0,0.18,0.27,0.36,0.82,1.0]
plt.plot(pub_fpr, pub_tpr, color='g', lw=lw,
label='ROC curve as pubished in \n Manno et al 1994 (area = %0.2f)'%(auc(pub_fpr,pub_tpr)))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of multi-class outcome prediction using Manno et al (1994)')
plt.legend(loc="lower right")
plt.show()
In [679]:
# Load all features DataFrame
df_S6_Manno = pd.read_csv('../data/Manno_et_al_qPET_data.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df_S6_Manno['id'].isin(['PET75','PET77','PET18'])
df_S6_Manno = df_S6_Manno[~mask]
df_S6_Manno_train_idx = np.where(df_S6_Manno.id.isin(train_idx))
df_S6_Manno_test_idx = np.where(df_S6_Manno.id.isin(test_idx))
from sklearn.discriminant_analysis import *
X_S6_Manno = np.array(df_S6_Manno[df_S6_Manno.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])])
y_S6_Manno = np.array(df_S6_Manno.outcome_trinary-1)
y_S6_Manno[y_S6_Manno > 0] = 1
X_S6_Manno_labels = df_S6_Manno.columns.difference(['Unnamed: 0','id','outcome','outcome_trinary','resected'])
X_S6_Manno = X_S6_Manno.reshape(-1, 1)
X_S6_Manno_train = np.squeeze(X_S6_Manno[df_S6_Manno_train_idx,:])
y_S6_Manno_train = y_S6_Manno[df_S6_Manno_train_idx]
X_S6_Manno_test = np.squeeze(X_S6_Manno[df_S6_Manno_test_idx,:])
y_S6_Manno_test = y_S6_Manno[df_S6_Manno_test_idx]
X_S6_Manno_train = X_S6_Manno_train.reshape(-1, 1)
X_S6_Manno_test = X_S6_Manno_test.reshape(-1, 1)
# Learn to predict each class
# classifier_S6_Manno = LinearDiscriminantAnalysis()
# classifier_S6_Manno = RandomForestClassifier(n_estimators=1000)
classifier_S6_Manno = LogisticRegression()
y_score = classifier_S6_Manno.fit(X_S6_Manno_train, y_S6_Manno_train).predict_proba(X_S6_Manno_test)
y_hat = classifier_S6_Manno.fit(X_S6_Manno_train, y_S6_Manno_train).predict(X_S6_Manno_test)
precision = precision_score(y_S6_Manno_test,y_hat,average='weighted')
fpr, tpr, thresholds = roc_curve(y_S6_Manno_test,y_score[:,1])
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10,8))
plt.plot(fpr, tpr, lw=lw, color='g', linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
pub_tpr = [0,0.5,0.78,0.91,0.94,1.0]
pub_fpr = [0,0.18,0.27,0.36,0.82,1.0]
plt.plot(pub_fpr, pub_tpr, color='g', lw=lw,
label='ROC curve as pubished in \n Manno et al 1994 (area = %0.2f)'%(auc(pub_fpr,pub_tpr)))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of binary outcome prediction using Manno et al (1994)')
plt.legend(loc="lower right")
plt.show()
In [74]:
# Load all DataFrames
df1 = pd.read_csv('../data/all_pts_qPET_feature_matrix_clinical_only.csv')
df3 = pd.read_csv('../data/all_pts_qPET_feature_matrix_clinical_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df1['id'].isin(['PET75','PET77'])
df1 = df1[~mask]
mask = df3['id'].isin(['PET75','PET77'])
df3 = df3[~mask]
## Create the complete training and testing set
# all_pts_train = [42,5,38,3,46,35,50,28,19,29,15,49,41,23,20,8,21,26,30,47,31,34,2,1,48,32,18,22,0,24,53,36,45,51,39,52]
all_pts_test = [12,55,43,4,14,9,44,54,10,13,40,17,11,33,16,6,7,25]
mask = df3['Unnamed: 0'].isin(test)
all_pts_train_idx = df3[~mask].id
all_pts_test_idx = df3[mask].id
## Create feature matrix $X$ and outcomes $y$ for Models 1 and 3
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
all_pts_df1_train_idx = np.where(df1.id.isin(all_pts_train_idx))
all_pts_df1_test_idx = np.where(df1.id.isin(all_pts_test_idx))
all_pts_X1 = np.array(df1[df1.columns.difference(['Unnamed: 0','id','outcome_binary'])])
all_pts_y1 = np.array(df1.outcome_binary-1)
all_pts_X1_labels = df1.columns.difference(['Unnamed: 0','id','outcome_binary'])
all_pts_df3_train_idx = np.where(df3.id.isin(all_pts_train_idx))
all_pts_df3_test_idx = np.where(df3.id.isin(all_pts_test_idx))
all_pts_X3 = np.array(df3[df3.columns.difference(['Unnamed: 0','id','outcome_binary'])])
all_pts_y3 = np.array(df3.outcome_binary-1)
all_pts_X3_labels = df3.columns.difference(['Unnamed: 0','id','outcome_binary'])
## Perform feature reduction for each model
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Build a Random Forest with 5000 estimators
all_pts_classifier1 = RandomForestClassifier(n_estimators=1000)
all_pts_classifier3 = RandomForestClassifier(n_estimators=1000)
## Do feature reduction on Model 1
print 'Model 1 ..................'
X = np.copy(all_pts_X1)
y = np.copy(all_pts_y1)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
all_pts_X1_pruned = X
all_pts_X1_labels = np.array(poly.get_feature_names(all_pts_X1_labels))
## Do feature reduction on Model 3
print 'Model 3 ..................'
X = np.copy(all_pts_X3)
y = np.copy(all_pts_y3)
# Perform the first prune using ANOVA F test using mutual information
print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
X = prune1.fit_transform(X,y)
all_pts_X3_labels = all_pts_X3_labels[prune1.get_support()]
# Perform a second prune by selecting features optimally branched using the classifier
print 'Second round of feature reduction ...'
clf = all_pts_classifier3.fit(X,y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
all_pts_X3_labels = all_pts_X3_labels[model.get_support()]
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
all_pts_X3_labels = np.array(poly.get_feature_names(all_pts_X3_labels))
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%k2
prune2 = SelectKBest(mutual_info_classif, k=k2)
X = prune2.fit_transform(X,y)
all_pts_X3_pruned = X
all_pts_X3_labels = all_pts_X3_labels[prune2.get_support()]
colors = ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange']
lw = 2
# Train and test Model 1
all_pts_X1_train = np.squeeze(all_pts_X1_pruned[all_pts_df1_train_idx,:])
all_pts_y1_train = all_pts_y1[all_pts_df1_train_idx]
all_pts_X1_test = np.squeeze(all_pts_X1_pruned[all_pts_df1_test_idx,:])
all_pts_y1_test = all_pts_y1[all_pts_df1_test_idx]
all_pts_y1_hat = all_pts_classifier1.fit(all_pts_X1_train,all_pts_y1_train).predict_proba(all_pts_X1_test)
fpr, tpr, thresholds = roc_curve(all_pts_y1_test,all_pts_y1_hat[:,1])
print fpr
print tpr
print thresholds
roc_auc = auc(fpr, tpr)
i = 1
plt.plot(fpr, tpr, lw=lw, color=colors[i], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Train and test Model 3
all_pts_X3_train = np.squeeze(all_pts_X3_pruned[all_pts_df3_train_idx,:])
all_pts_y3_train = all_pts_y3[all_pts_df3_train_idx]
all_pts_X3_test = np.squeeze(all_pts_X3_pruned[all_pts_df3_test_idx,:])
all_pts_y3_test = all_pts_y3[all_pts_df3_test_idx]
all_pts_y3_hat = all_pts_classifier3.fit(all_pts_X3_train,all_pts_y3_train).predict_proba(all_pts_X3_test)
fpr, tpr, thresholds = roc_curve(all_pts_y3_test,all_pts_y3_hat[:,1])
print fpr
print tpr
print thresholds
roc_auc = auc(fpr, tpr)
i = 3
plt.plot(fpr, tpr, lw=lw, color=colors[i+1], linestyle='--', label = 'Model %d (area = %0.2f)'%(i,roc_auc), alpha=0.5)
# Plot the ROC curve for luck along with area
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('Receiver operating characteristic')
plt.xlabel('False Postive Rate')
plt.ylabel('Trupe Postive Rate')
plt.legend(loc="lower right")
plt.show()
In [ ]:
# Parameters
k1 = 350 # Number of features remaining after first round of feature reduction
k2 = 350 # Number of features remaining after second round of feature reduction
# Load DataFrame
df = pd.read_csv('../data/qPET_feature_matrix_clinical_3dspp_voxel_ai.csv')
# Clean DataFrame by removing patients that had abnormally noisy imaging features
mask = df['id'].isin(['PET75','PET77'])
df = df[~mask]
# Generate feature matrix and target vectors
print 'Generating feature matrix ...'
X = np.array(df[df.columns.difference(['Unnamed: 0','id','outcome_binary'])])
y = np.array(df.outcome_binary-1)
# Reduce feature dimension by PCA
pca = PCA(n_components=X.shape[1])
X = pca.fit_transform(X)
# Build a Random Forest with 5000 estimators
classifier_all = RandomForestClassifier(n_estimators=1000)
# classifier_all = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=2)
# # Perform the first prune using ANOVA F test using mutual information
# print 'Feature reduction to %i features ...'%(min(k1,X.shape[1]))
# prune1 = SelectKBest(mutual_info_classif, k=min(k1,X.shape[1]))
# X = prune1.fit_transform(X,y)
# # Perform a second prune by selecting features optimally branched using the classifier
# print 'Second round of feature reduction ...'
# clf = classifier_all.fit(X,y)
# model = SelectFromModel(clf, prefit=True)
# X = model.transform(X)
# Generate polynomial (degree 2) with interaction term feature set to account
# for non-linear combinations.
# This will include cross-interaction terms to take into account non-linear combinations
# of clinical variables.
print 'Generating polynomial combination of features ...'
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
# Perform the third prune using ANOVA F test using mutual information
print 'Final round of feature reduction to %i features ...'%(min(k2, X.shape[1]))
prune2 = SelectKBest(mutual_info_classif, k=min(k2, X.shape[1]))
X = prune2.fit_transform(X,y)
# Compute 8-fold cross-validation True Positive Rate (TPR) and False Positive Rate (FPR)
# to generate ROC curves.
print 'Performing cross validation ...'
n_splits = 8
cv = StratifiedKFold(n_splits=n_splits)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for (train, test) in cv.split(X,y):
# Ignore any folds that do not have any poor outcomes
# to maintain representation of entire dataset.
if(sum(y[test]) == 0):
continue
probas_ = classifier_all.fit(X[train], y[train]).predict_proba(X[test])
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
# Compute mean Area Under Curve (AUC)
print 'Computing AUC of ROC ...'
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc_all = auc(mean_fpr, mean_tpr)
sigma_auc_all = 2*np.sqrt(mean_auc_all*(1-mean_auc_all)/4)
# Compute ROC characteristics for a random train test split
train = [5,6,8,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]
test = [0,1,2,3,4,7,9,12]
probas_ = classifier_all.fit(X[train], y[train]).predict_proba(X[test])
fpr_all, tpr_all, thresholds_all = roc_curve(y[test], probas_[:, 1])
# Compute Mean AUC and Confidence interval using \sigma_max
display(Latex("Mean AUC: %0.3f $\pm$ %0.3f"%(mean_auc_all,sigma_auc_all)))